EA count data

# WD
setwd("~/PhD/London NERC DTP documents/EA data/EnvironmentAgency_EcologyFish_DataDownload_2025-04-02")

# Load data
counts <- read_csv("TraC_Fish_Counts_2025-10-10.csv")
sites <- read_csv("TraC_Fish_Sites_2025-10-10.csv")

# Join count data with site information
counts_clean <- counts %>% left_join(sites, by = "SITE_ID") %>%
  mutate(EVENT_DATE = ymd(EVENT_DATE),
         EVENT_YEAR = year(EVENT_DATE),
         EVENT_MONTH = month(EVENT_DATE),
         SEASON = if_else(EVENT_MONTH %in% 4:9, "Summer", "Winter"))

counts_clean <- counts_clean %>%
  select(
    SITE_ID,
    SITE_NAME = SITE_NAME.x,
    SITE_PARENT_NAME,
    SURVEY_ID,
    EVENT_DATE,
    SPECIES_NAME,
    LATIN_NAME,
    FISH_COUNT,
    NO_OF_RUNS,
    TOP_TIER_SITE,
    SITE_RANKED_NGR,
    SITE_RANKED_EASTING,
    SITE_RANKED_NORTHING)

# Convert dates and extract components
counts_clean <- counts_clean %>%
  mutate(
    EVENT_DATE = ymd(EVENT_DATE),
    EVENT_DATE_YEAR = year(EVENT_DATE),
    EVENT_MONTH = month(EVENT_DATE),
    EVENT_DAY = day(EVENT_DATE))

# Sampling method
counts_clean <- counts_clean %>%
  mutate(
    METHOD = case_when(
      str_detect(str_to_lower(SITE_NAME), "seine") ~ "Seine Net",
      str_detect(str_to_lower(SITE_NAME), "beam") ~ "Beam Trawl",
      str_detect(str_to_lower(SITE_NAME), "otter") ~ "Otter Trawl",
      str_detect(str_to_lower(SITE_NAME), "kick") ~ "Kick Sample",
      str_detect(str_to_lower(SITE_NAME), "fyke") ~ "Fyke Net",
      str_detect(str_to_lower(SITE_NAME), "push") ~ "Push Net",
      str_detect(str_to_lower(SITE_NAME), "hand") ~ "Hand Collection",
      str_detect(str_to_lower(SITE_NAME), "gill") ~ "Gill Net",
      TRUE ~ "Unknown"))

# Thames only
counts_thames <- counts_clean %>%
  filter(str_detect(TOP_TIER_SITE, regex("thames", ignore_case = TRUE)))

# Lat and long
sites_sf <- st_as_sf(counts_thames, coords = c("SITE_RANKED_EASTING", "SITE_RANKED_NORTHING"), crs = 27700) %>%
  st_transform(crs = 4326)
coords <- st_coordinates(sites_sf)
sites_sf$lon <- coords[, 1]
sites_sf$lat <- coords[, 2]

# Gears
sites_sf <- sites_sf %>%
  mutate(
    METHOD_TYPE = case_when(
      str_detect(str_to_lower(SITE_NAME), "seine") ~ "Seine Net",
      str_detect(str_to_lower(SITE_NAME), "beam") ~ "Beam Trawl",
      str_detect(str_to_lower(SITE_NAME), "otter") ~ "Otter Trawl",
      str_detect(str_to_lower(SITE_NAME), "kick") ~ "Kick Sample",
      str_detect(str_to_lower(SITE_NAME), "fyke") ~ "Fyke Net",
      str_detect(str_to_lower(SITE_NAME), "push") ~ "Push Net",
      str_detect(str_to_lower(SITE_NAME), "hand") ~ "Hand Collection",
      str_detect(str_to_lower(SITE_NAME), "gill") ~ "Gill Net",
      TRUE ~ "Unknown"))

##### Map #####
method_types <- unique(sites_sf$METHOD_TYPE)
method_pal <- colorFactor(palette = "Set2", domain = method_types)

leaflet(sites_sf) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    lng = ~lon, lat = ~lat,
    color = ~method_pal(METHOD_TYPE),
    radius = 6,
    stroke = TRUE, weight = 1, fillOpacity = 0.9,
    popup = ~paste0(
      "<b>Site:</b> ", SITE_NAME, "<br>",
      "<b>Zone:</b> ", TOP_TIER_SITE, "<br>",
      "<b>METHOD:</b> ", METHOD_TYPE), group = ~METHOD_TYPE) %>%
  addLegend(
    position = "bottomright",
    pal = method_pal,
    values = ~METHOD_TYPE,
    title = "Method type",
    opacity = 1) %>%
  addLayersControl(
    overlayGroups = method_types,
    options = layersControlOptions(collapsed = FALSE),
    position = "topright") %>%
  addResetMapButton()
# RA
method_totals <- counts_thames %>%
  filter(!is.na(METHOD)) %>%
  group_by(METHOD) %>%
  summarise(
    Total_Individuals = sum(FISH_COUNT, na.rm = TRUE),
    Total_Species = n_distinct(SPECIES_NAME),
    .groups = "drop")

# Get grand total
grand_total <- sum(method_totals$Total_Individuals)

# Add relative abundance column
method_totals <- method_totals %>%
  mutate(
    Relative_Abundance = round(100 * Total_Individuals / grand_total, 1)
  ) %>%
  arrange(desc(Relative_Abundance)) %>%
  mutate(METHOD = factor(METHOD, levels = unique(METHOD)))

##### RA #####
ggplot(method_totals, aes(x = reorder(METHOD, -Relative_Abundance), y = Relative_Abundance)) +
  geom_col(fill = "tomato") +
  labs(
    title = "Relative abundance by sampling method",
    x = "Sampling method",
    y = "Relative abundance (%)") +
  theme_minimal()

# Matrix
method_species_table <- counts_thames %>%
  filter(!is.na(METHOD), !is.na(SPECIES_NAME)) %>%
  count(METHOD, SPECIES_NAME) %>%
  pivot_wider(names_from = METHOD, values_from = n, values_fill = 0)

method_pa <- method_species_table %>%
  mutate(across(-SPECIES_NAME, ~ as.integer(. > 0)))

method_cols <- names(method_pa)[!names(method_pa) %in% "SPECIES_NAME"]

method_unique <- method_pa %>%
  rowwise() %>%
  mutate(unique_to = if (sum(c_across(all_of(method_cols))) == 1) {
    method_cols[which(c_across(all_of(method_cols)) == 1)]
  } else {
    NA_character_
  }) %>%
  ungroup() %>%
  filter(!is.na(unique_to))

unique_counts <- method_unique %>%
  count(unique_to, name = "unique_species_count") %>%
  arrange(desc(unique_species_count))

ggplot(unique_counts, aes(x = reorder(unique_to, unique_species_count), y = unique_species_count, fill = unique_to)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Number of species unique to each method type",
    x = "Sampling method", y = "Unique species count") +
  theme_minimal()

method_pa_long <- method_pa %>%
  pivot_longer(cols = -SPECIES_NAME, names_to = "METHOD", values_to = "present")

ggplot(method_pa_long, aes(x = METHOD, y = SPECIES_NAME, fill = as.factor(present))) +
  geom_tile(color = "white") +
  scale_fill_manual(values = c("0" = "white", "1" = "steelblue")) +
  labs(
    title = "Species presence across sampling method",
    x = "Sampling method", y = "Species",
    fill = "Present") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 6))

Seine net data only.

# 1) Filter to Seine Net
counts_thames_filtered <- counts_thames %>%
  filter(METHOD == "Seine Net")

# 2) Per-survey CPUE (fish per run)
#    - If NO_OF_RUNS can vary within a survey, switch `first()` to `max()` and sanity-check.
seine_cpue <- counts_thames_filtered %>%
  filter(!is.na(FISH_COUNT), NO_OF_RUNS > 0) %>%
  group_by(SURVEY_ID, EVENT_DATE_YEAR) %>%
  summarise(
    total_fish = sum(FISH_COUNT, na.rm = TRUE),
    runs       = dplyr::first(NO_OF_RUNS),   # or max(NO_OF_RUNS)
    n_checks   = n_distinct(NO_OF_RUNS),     # for sanity checking
    .groups    = "drop"
  ) %>%
  mutate(CPUE = total_fish / runs)

# Optional sanity check:
# stopifnot(all(seine_cpue$n_checks == 1))

# 3) Yearly, survey-balanced summary (each survey = 1 vote)
cpue_year <- seine_cpue %>%
  group_by(EVENT_DATE_YEAR) %>%
  summarise(
    mean_CPUE = mean(CPUE, na.rm = TRUE),
    sd_CPUE   = sd(CPUE, na.rm = TRUE),
    n_surveys = n(),
    se        = sd_CPUE / sqrt(n_surveys),
    .groups   = "drop"
  )

# 4) Quick overview plot (mean ± SE)
p1 <- ggplot(cpue_year, aes(x = EVENT_DATE_YEAR, y = mean_CPUE)) +
  geom_ribbon(aes(ymin = mean_CPUE - se, ymax = mean_CPUE + se),
              alpha = 0.2, fill = "steelblue") +
  geom_line(colour = "steelblue", linewidth = 1) +
  geom_point(colour = "steelblue", size = 2) +
  labs(
    title = "Mean seine net CPUE per year – Thames Estuary (survey-balanced)",
    x = "Year",
    y = "Mean CPUE (fish per run)"
  ) +
  theme_minimal()

# 5) Trend model on yearly means
lm1 <- lm(mean_CPUE ~ EVENT_DATE_YEAR, data = cpue_year)

cpue_year$pred    <- predict(lm1)
cpue_year$pred_se <- predict(lm1, se.fit = TRUE)$se.fit

s      <- summary(lm1)
slope  <- coef(s)["EVENT_DATE_YEAR","Estimate"]
pval   <- coef(s)["EVENT_DATE_YEAR","Pr(>|t|)"]
lab    <- sprintf("LM slope = %.2f fish/run/yr (p = %.3f)", slope, pval)

# 6) Trend plot with fitted mean and 95% CI of the fit
p2 <- ggplot(cpue_year, aes(EVENT_DATE_YEAR, mean_CPUE)) +
  geom_point(size = 2) +
  geom_line(col = "grey40") +
  geom_ribbon(aes(ymin = pred - 1.96 * pred_se, ymax = pred + 1.96 * pred_se),
              fill = "lightblue", alpha = 0.35) +
  geom_line(aes(y = pred), col = "blue", linewidth = 1) +
  labs(
    title = "Trend in mean seine net CPUE – Thames Estuary (survey-balanced)",
    x = "Year", y = "Mean CPUE (fish per run)"
  ) +
  annotate("text",
           x = min(cpue_year$EVENT_DATE_YEAR, na.rm = TRUE) + 1,
           y = max(cpue_year$mean_CPUE, na.rm = TRUE),
           hjust = 0, vjust = 1, size = 3.5, label = lab) +
  theme_minimal()

p1

p2

# --- per-survey CPUE per species ---
species_svy_cpue <- counts_thames_filtered %>%
  filter(NO_OF_RUNS > 0) %>%
  group_by(SURVEY_ID, EVENT_DATE_YEAR, SPECIES_NAME) %>%
  summarise(
    count = sum(FISH_COUNT, na.rm = TRUE),
    runs  = dplyr::first(NO_OF_RUNS),
    .groups = "drop"
  ) %>%
  mutate(CPUE = count / runs)

# --- yearly, survey-balanced CPUE per species ---
species_trends <- species_svy_cpue %>%
  group_by(EVENT_DATE_YEAR, SPECIES_NAME) %>%
  summarise(
    mean_CPUE = mean(CPUE, na.rm = TRUE),
    n_surveys = n(),
    .groups = "drop"
  )

# --- per-species trend (weights = n_surveys) ---
species_model_summary <- function(df) {
  if (dplyr::n_distinct(df$EVENT_DATE_YEAR) < 3 || sd(df$mean_CPUE, na.rm = TRUE) == 0) {
    return(tibble(Slope = NA_real_, CI_low = NA_real_, CI_high = NA_real_,
                  P_value = NA_real_, Significance = NA_character_))
  }
  m <- lm(mean_CPUE ~ EVENT_DATE_YEAR, data = df, weights = n_surveys)
  s  <- summary(m)
  slope <- coef(s)["EVENT_DATE_YEAR","Estimate"]
  se    <- coef(s)["EVENT_DATE_YEAR","Std. Error"]
  p     <- coef(s)["EVENT_DATE_YEAR","Pr(>|t|)"]
  tibble(
    Slope = slope,
    CI_low  = slope - 1.96 * se,
    CI_high = slope + 1.96 * se,
    P_value = p,
    Significance = dplyr::case_when(
      p <= 0.001 ~ "***",
      p <= 0.01  ~ "**",
      p <= 0.05  ~ "*",
      TRUE       ~ ""
    )
  )
}

species_results <- species_trends %>%
  group_by(SPECIES_NAME) %>%
  group_modify(~ species_model_summary(.x)) %>%
  ungroup() %>%
  mutate(
    SigCat = factor(Significance,
                    levels = c("***","**","*",""),
                    labels = c("***","**","*","ns"))
  )

species_results_sig <- species_results %>%
  filter(P_value <= 0.05) %>%
  mutate(
    SigCat = factor(Significance,
                    levels = c("***","**","*"),
                    labels = c("***","**","*"))
  )

# --- Define colour palette for significance ---
sig_cols <- c(
  "***" = "#3b82f6",  # blue
  "**"  = "#22c55e",  # green
  "*"   = "#ef4444"   # red
)

# --- Plot only significant species ---
ggplot(species_results_sig, aes(x = Slope, y = reorder(SPECIES_NAME, Slope))) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
  geom_errorbarh(aes(xmin = CI_low, xmax = CI_high), height = 0.3, color = "gray70") +
  geom_point(aes(color = SigCat), size = 3) +
  scale_color_manual(values = sig_cols, drop = FALSE, name = "Significance",
                     breaks = c("***","**","*")) +
  labs(
    title = "Significant species-level trends in abundance (CPUE) – Thames Estuary",
    subtitle = "Only species with p ≤ 0.05 shown. Slope ± 95% CI (weighted by number of surveys).",
    x = "Slope (Δ CPUE per year)", 
    y = "Species", 
    caption = "Significance: * p ≤ 0.05 | ** p ≤ 0.01 | *** p ≤ 0.001"
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    legend.direction = "horizontal",
    legend.title = element_text(size = 11),
    legend.text = element_text(size = 10)
  )

svy_comp <- counts_thames_filtered %>%
  group_by(EVENT_DATE_YEAR, SURVEY_ID, SPECIES_NAME) %>%
  summarise(n = sum(FISH_COUNT, na.rm = TRUE), .groups = "drop_last")

div_svy <- svy_comp %>%
  summarise(
    Species_Richness  = n_distinct(SPECIES_NAME),
    Total_Abundance   = sum(n),
    Shannon_Diversity = vegan::diversity(n, index = "shannon"),
    Simpson_Diversity = vegan::diversity(n, index = "simpson"),
    .groups = "drop"
  ) %>%
  mutate(
    Margalef_Richness = (Species_Richness - 1) / log(pmax(Total_Abundance, 1)),
    Pielou_Evenness   = Shannon_Diversity / log(pmax(Species_Richness, 2))
  )

diversity_summary <- div_svy %>%
  group_by(EVENT_DATE_YEAR) %>%
  summarise(
    Richness  = mean(Species_Richness),  se_Richness  = sd(Species_Richness)/sqrt(n()),
    Shannon   = mean(Shannon_Diversity), se_Shannon   = sd(Shannon_Diversity)/sqrt(n()),
    Simpson   = mean(Simpson_Diversity), se_Simpson   = sd(Simpson_Diversity)/sqrt(n()),
    Margalef  = mean(Margalef_Richness), se_Margalef  = sd(Margalef_Richness)/sqrt(n()),
    Pielou    = mean(Pielou_Evenness),   se_Pielou    = sd(Pielou_Evenness)/sqrt(n()),
    n_surveys      = n(),
    .groups = "drop"
  )

diversity_long <- diversity_summary %>%
  select(EVENT_DATE_YEAR,
         Richness, Shannon, Simpson, Margalef, Pielou) %>%
  pivot_longer(cols = -EVENT_DATE_YEAR,
               names_to = "Metric", values_to = "Value")

ggplot(diversity_long, aes(x = EVENT_DATE_YEAR, y = Value)) +
  geom_line(color = "red", linewidth = 1) +
  geom_smooth(method = "loess", se = TRUE, color = "black", linewidth = 1) +
  facet_wrap(~ Metric, scales = "free_y") +
  labs(
    title = "Diversity metrics over time – Thames Estuary (survey-balanced)",
    subtitle = "Per-survey diversity averaged per year (LOESS trend)",
    x = "Year", y = "Diversity metric (mean)"
  ) +
  theme_minimal()

lm_richness <- lm(Richness  ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_shannon  <- lm(Shannon   ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_simpson  <- lm(Simpson   ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_margalef <- lm(Margalef  ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_evenness <- lm(Pielou    ~ EVENT_DATE_YEAR, data = diversity_summary)

summary_table <- function(model, metric_name) {
  coef_summary <- summary(model)$coefficients
  slope <- coef_summary["EVENT_DATE_YEAR", "Estimate"]
  se <- coef_summary["EVENT_DATE_YEAR", "Std. Error"]
  p <- coef_summary["EVENT_DATE_YEAR", intersect(c("Pr(>|t|)", "Pr(>|z|)"),
                                                 colnames(coef_summary))[1]]
  ci_low <- slope - 1.96 * se
  ci_high <- slope + 1.96 * se
  sig <- case_when(p <= 0.001 ~ "***", p <= 0.01 ~ "**", p <= 0.05 ~ "*", TRUE ~ "")
  data.frame(Metric = metric_name, Slope = slope, SE = se,
             CI_low = ci_low, CI_high = ci_high,
             P_value = round(p, 4), Significance = sig)
}

results <- bind_rows(
  summary_table(lm_richness,  "Species richness"),
  summary_table(lm_shannon,   "Shannon diversity"),
  summary_table(lm_simpson,   "Simpson diversity"),
  summary_table(lm_margalef,  "Margalef richness"),
  summary_table(lm_evenness,  "Pielou evenness")
)

results_lab <- results %>%
  mutate(
    SigCat = factor(Significance,
                    levels = c("***","**","*",""),
                    labels = c("***","**","*","ns")),
    Metric_lab = dplyr::case_when(
      Metric == "Species richness" ~ Metric,                 # never add stars here
      Significance == ""           ~ Metric,                 # ns → no stars
      TRUE                         ~ paste0(Metric, " ", Significance)
    )
  )

sig_cols <- c(
  "***" = "#3b82f6",  # blue
  "**"  = "#22c55e",  # green
  "*"   = "#ef4444",  # red
  "ns"  = "#9ca3af"   # grey
)

ggplot(results_lab, aes(x = Slope, y = reorder(Metric_lab, Slope))) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
  geom_errorbarh(aes(xmin = CI_low, xmax = CI_high), height = 0.22, color = "gray70") +
  geom_point(aes(color = SigCat), size = 3) +
  scale_color_manual(values = sig_cols, drop = FALSE, name = "Significance",
                     breaks = c("***","**","*","ns")) +
  labs(
    title = "Trend in diversity metrics over time – Thames Estuary",
    subtitle = "Slope ± 95% CI from linear models on yearly survey-balanced means",
    x = "Slope (change per year)", y = "Diversity metric (mean)",
    caption = "Significance: * p ≤ 0.05 | ** p ≤ 0.01 | *** p ≤ 0.001"
  ) +
  theme_minimal() +
  theme(legend.position = "top",
        legend.direction = "horizontal",
        legend.title = element_text(size = 11),
        legend.text  = element_text(size = 10))

pa_svy <- counts_thames_filtered %>%
  filter(!is.na(SPECIES_NAME), !is.na(FISH_COUNT)) %>%
  mutate(PA = as.integer(FISH_COUNT > 0)) %>%
  group_by(EVENT_DATE_YEAR, SURVEY_ID, SPECIES_NAME) %>%
  summarise(present = as.integer(any(PA == 1)), .groups = "drop")

svy_per_year <- pa_svy %>%
  distinct(EVENT_DATE_YEAR, SURVEY_ID) %>%
  count(EVENT_DATE_YEAR, name = "n_surveys")

min_svy <- min(svy_per_year$n_surveys)

set.seed(42)
sampled_surveys <- pa_svy %>%
  distinct(EVENT_DATE_YEAR, SURVEY_ID) %>%
  group_by(EVENT_DATE_YEAR) %>%
  slice_sample(n = min_svy) %>%
  ungroup()

year_species_pa <- pa_svy %>%
  semi_join(sampled_surveys, by = c("EVENT_DATE_YEAR","SURVEY_ID")) %>%
  group_by(EVENT_DATE_YEAR, SPECIES_NAME) %>%
  summarise(present = as.integer(any(present == 1)), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = SPECIES_NAME, values_from = present, values_fill = 0) %>%
  tibble::column_to_rownames("EVENT_DATE_YEAR") %>%
  as.data.frame()

beta_core <- betapart::betapart.core(year_species_pa)
beta_res  <- betapart::beta.pair(beta_core, index.family = "sor")

sor_mat <- as.matrix(beta_res$beta.sor)
sim_mat <- as.matrix(beta_res$beta.sim)
sne_mat <- as.matrix(beta_res$beta.sne)

years <- sort(as.numeric(rownames(sor_mat)))

beta_trends <- tibble::tibble(
  Year1 = years[-length(years)],
  Year2 = years[-1],
  beta_sor = purrr::map2_dbl(Year1, Year2, ~ sor_mat[as.character(.x), as.character(.y)]),
  beta_sim = purrr::map2_dbl(Year1, Year2, ~ sim_mat[as.character(.x), as.character(.y)]),
  beta_sne = purrr::map2_dbl(Year1, Year2, ~ sne_mat[as.character(.x), as.character(.y)])
)

lm_sor <- lm(beta_sor ~ Year2, data = beta_trends)
lm_sim <- lm(beta_sim ~ Year2, data = beta_trends)
lm_sne <- lm(beta_sne ~ Year2, data = beta_trends)

get_lm_stats <- function(model) {
  sm <- summary(model)
  list(r2 = round(sm$r.squared, 3),
       p  = round(sm$coefficients[2, 4], 3))
}

sor_stats <- get_lm_stats(lm_sor)
sim_stats <- get_lm_stats(lm_sim)
sne_stats <- get_lm_stats(lm_sne)

beta_trends_long <- beta_trends %>%
  tidyr::pivot_longer(c(beta_sor, beta_sim, beta_sne),
                      names_to = "Component", values_to = "Dissimilarity") %>%
  dplyr::mutate(Component = dplyr::recode(Component,
                                          beta_sor = "Sorensen", beta_sim = "Turnover", beta_sne = "Nestedness"))

subtitle_text <- paste0(
  "Sorensen: R² = ", sor_stats$r2, ", p = ", sor_stats$p, "   |   ",
  "Turnover: R² = ", sim_stats$r2, ", p = ", sim_stats$p, "   |   ",
  "Nestedness: R² = ", sne_stats$r2, ", p = ", sne_stats$p)


ggplot(beta_trends_long, aes(x = Year2, y = Dissimilarity, color = Component)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  scale_color_manual(values = c("Sorensen" = "darkorchid",
                                "Turnover" = "steelblue",
                                "Nestedness" = "darkorange")) +
  labs(
    title = "Year-to-year change in beta diversity components – Thames Estuary (survey-balanced)",
    subtitle = subtitle_text,
    x = "Year", y = "Dissimilarity", color = "Component"
  ) +
  theme_minimal()

Long-term seine net records from the Thames Estuary revealed a significant decline in overall fish abundance (CPUE) since the late 1980s, alongside a reduction in species richness, while other diversity indices showed no consistent temporal trend. Year-to-year community dissimilarity increased, driven primarily by species turnover, indicating progressive shifts in assemblage composition. Species-level trends showed rising abundances of freshwater-tolerant species such as roach, minnow, and bullhead, contrasted by declines in estuarine taxa including sand smelt, sea bass, and common goby.

EA length data

# --- Setup & load ---
setwd("~/PhD/London NERC DTP documents/EA data/EnvironmentAgency_EcologyFish_DataDownload_2025-04-02")

lengths <- read_csv("TraC_Fish_Individual_Lengths_2025-10-10.csv")
sites   <- read_csv("TraC_Fish_Sites_2025-10-10.csv")

# Join + tidy
lengths_clean <- lengths %>%
  left_join(sites, by = "SITE_ID") %>%
  select(
    SITE_ID,
    SITE_NAME = SITE_NAME.x,
    SITE_PARENT_NAME,
    EVENT_DATE,
    SURVEY_ID,
    SPECIES_NAME,
    LATIN_NAME,
    FISH_LENGTH,
    TOP_TIER_SITE,
    SITE_RANKED_NGR,
    SITE_RANKED_EASTING,
    SITE_RANKED_NORTHING
  ) %>%
  mutate(
    EVENT_DATE = ymd(EVENT_DATE),
    EVENT_YEAR = year(EVENT_DATE),
    EVENT_MONTH = month(EVENT_DATE),
    EVENT_DAY = day(EVENT_DATE),
    METHOD = case_when(
      str_detect(str_to_lower(SITE_NAME), "seine") ~ "Seine Net",
      str_detect(str_to_lower(SITE_NAME), "beam")  ~ "Beam Trawl",
      str_detect(str_to_lower(SITE_NAME), "otter") ~ "Otter Trawl",
      str_detect(str_to_lower(SITE_NAME), "kick")  ~ "Kick Sample",
      str_detect(str_to_lower(SITE_NAME), "fyke")  ~ "Fyke Net",
      str_detect(str_to_lower(SITE_NAME), "push")  ~ "Push Net",
      str_detect(str_to_lower(SITE_NAME), "hand")  ~ "Hand Collection",
      str_detect(str_to_lower(SITE_NAME), "gill")  ~ "Gill Net",
      TRUE ~ "Unknown"
    )
  )

# Thames + Seine only
lengths_thames_filtered <- lengths_clean %>%
  filter(str_detect(TOP_TIER_SITE, regex("thames", ignore_case = TRUE)),
         METHOD == "Seine Net",
         !is.na(FISH_LENGTH), FISH_LENGTH > 0)

# (Optional) Species-wise IQR outlier removal
lengths_thames_filtered <- lengths_thames_filtered %>%
  group_by(LATIN_NAME) %>%
  mutate(
    Q1 = quantile(FISH_LENGTH, 0.25, na.rm = TRUE),
    Q3 = quantile(FISH_LENGTH, 0.75, na.rm = TRUE),
    IQRv = Q3 - Q1,
    lo = Q1 - 1.5 * IQRv,
    hi = Q3 + 1.5 * IQRv
  ) %>%
  ungroup() %>%
  filter(FISH_LENGTH >= lo, FISH_LENGTH <= hi)
# Per-survey summaries (one row per survey)
survey_means <- lengths_thames_filtered %>%
  group_by(EVENT_YEAR, SURVEY_ID) %>%
  summarise(
    mean_len_survey   = mean(FISH_LENGTH),
    median_len_survey = median(FISH_LENGTH),
    n_fish            = n(),
    .groups = "drop"
  )

# Yearly (survey-balanced) summaries
survey_bal <- survey_means %>%
  group_by(EVENT_YEAR) %>%
  summarise(
    mean_length   = mean(mean_len_survey),      # equal weight per survey
    median_length = median(mean_len_survey),
    sd_mean       = sd(mean_len_survey),
    n_surveys     = n(),
    se_mean       = sd_mean / sqrt(n_surveys),
    .groups = "drop"
  ) %>%
  arrange(EVENT_YEAR)

# Linear trends on survey-balanced means
lm_mean   <- lm(mean_length   ~ EVENT_YEAR, data = survey_bal)
lm_median <- lm(median_length ~ EVENT_YEAR, data = survey_bal)

get_coef <- function(mod) {
  s <- summary(mod)$coefficients
  list(slope = unname(s["EVENT_YEAR","Estimate"]),
       p     = unname(s["EVENT_YEAR","Pr(>|t|)"]))
}
lab_mean   <- with(get_coef(lm_mean),   sprintf("Mean: slope = %.2f mm/yr (p = %.3g)",   slope, p))
lab_median <- with(get_coef(lm_median), sprintf("Median: slope = %.2f mm/yr (p = %.3g)", slope, p))

# Plot with top annotations
x_left <- min(survey_bal$EVENT_YEAR, na.rm = TRUE)
ggplot(survey_bal, aes(EVENT_YEAR)) +
  geom_ribbon(aes(ymin = mean_length - se_mean, ymax = mean_length + se_mean),
              alpha = 0.15, fill = "steelblue") +
  geom_line(aes(y = mean_length, color = "Mean"), linewidth = 1.2) +
  geom_point(aes(y = mean_length, color = "Mean"), size = 2) +
  geom_smooth(aes(y = mean_length, color = "Mean"), method = "lm", se = FALSE, linewidth = 0.9) +
  geom_line(aes(y = median_length, color = "Median"), linewidth = 1.2, linetype = "dashed") +
  geom_point(aes(y = median_length, color = "Median"), size = 2, shape = 17) +
  geom_smooth(aes(y = median_length, color = "Median"), method = "lm", se = FALSE, linetype = "dashed", linewidth = 0.9) +
  annotate("text", x = x_left, y = Inf, hjust = 0, vjust = 1.4, label = lab_mean,   color = "steelblue",  size = 4) +
  annotate("text", x = x_left, y = Inf, hjust = 0, vjust = 2.8, label = lab_median, color = "darkorange", size = 4) +
  scale_y_continuous(expand = expansion(mult = c(0.02, 0.18))) +
  scale_color_manual(values = c("Mean" = "steelblue", "Median" = "darkorange")) +
  labs(title = "Survey-balanced fish length trends – Thames Estuary",
       subtitle = "Mean ± SE (solid) and median (dashed) per year with linear trends",
       x = "Year", y = "Fish length (mm)", color = "Statistic") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "top", panel.grid.minor = element_blank())

These declining estuarine species exhibited increasing body-size trends, suggesting reduced recruitment or juvenile survival but persistence of larger, older individuals within the population. Despite these compositional changes, overall mean fish length showed no significant long-term trend.

Together, these results suggest that the estuarine fish community has become less diverse, less abundant, and increasingly dominated by freshwater species.

EA subsites

Seven most frequently sampled sites.

# WD
setwd("~/PhD/London NERC DTP documents/EA data/EnvironmentAgency_EcologyFish_DataDownload_2025-04-02")

# Load data
counts <- read_csv("TraC_Fish_Counts_2025-10-10.csv")
sites <- read_csv("TraC_Fish_Sites_2025-10-10.csv")

# Join count data with site information
counts_clean <- counts %>% left_join(sites, by = "SITE_ID") %>%
  mutate(EVENT_DATE = ymd(EVENT_DATE),
         EVENT_YEAR = year(EVENT_DATE),
         EVENT_MONTH = month(EVENT_DATE),
         SEASON = if_else(EVENT_MONTH %in% 4:9, "Summer", "Winter"))

counts_clean <- counts_clean %>%
  select(
    SITE_ID,
    SITE_NAME = SITE_NAME.x,
    SITE_PARENT_NAME,
    SURVEY_ID,
    EVENT_DATE,
    SPECIES_NAME,
    LATIN_NAME,
    FISH_COUNT,
    NO_OF_RUNS,
    TOP_TIER_SITE,
    SITE_RANKED_NGR,
    SITE_RANKED_EASTING,
    SITE_RANKED_NORTHING)

# Convert dates and extract components
counts_clean <- counts_clean %>%
  mutate(
    EVENT_DATE = ymd(EVENT_DATE),
    EVENT_DATE_YEAR = year(EVENT_DATE),
    EVENT_MONTH = month(EVENT_DATE),
    EVENT_DAY = day(EVENT_DATE))

# Sampling method
counts_clean <- counts_clean %>%
  mutate(
    METHOD = case_when(
      str_detect(str_to_lower(SITE_NAME), "seine") ~ "Seine Net",
      str_detect(str_to_lower(SITE_NAME), "beam") ~ "Beam Trawl",
      str_detect(str_to_lower(SITE_NAME), "otter") ~ "Otter Trawl",
      str_detect(str_to_lower(SITE_NAME), "kick") ~ "Kick Sample",
      str_detect(str_to_lower(SITE_NAME), "fyke") ~ "Fyke Net",
      str_detect(str_to_lower(SITE_NAME), "push") ~ "Push Net",
      str_detect(str_to_lower(SITE_NAME), "hand") ~ "Hand Collection",
      str_detect(str_to_lower(SITE_NAME), "gill") ~ "Gill Net",
      TRUE ~ "Unknown"))

# Thames only
counts_thames <- counts_clean %>%
  filter(str_detect(TOP_TIER_SITE, regex("thames", ignore_case = TRUE)))

counts_thames <- counts_thames %>%
  filter(SITE_PARENT_NAME %in% c(
    "Richmond",
    "Kew",
    "Battersea",
    "Greenwich",
    "West Thurrock",
    "Denton Wharf",
    "Stanford-le-Hope Beach"
  ))

# Lat and long
sites_sf <- st_as_sf(counts_thames, coords = c("SITE_RANKED_EASTING", "SITE_RANKED_NORTHING"), crs = 27700) %>%
  st_transform(crs = 4326)
coords <- st_coordinates(sites_sf)
sites_sf$lon <- coords[, 1]
sites_sf$lat <- coords[, 2]

# Gears
sites_sf <- sites_sf %>%
  mutate(
    METHOD_TYPE = case_when(
      str_detect(str_to_lower(SITE_NAME), "seine") ~ "Seine Net",
      str_detect(str_to_lower(SITE_NAME), "beam") ~ "Beam Trawl",
      str_detect(str_to_lower(SITE_NAME), "otter") ~ "Otter Trawl",
      str_detect(str_to_lower(SITE_NAME), "kick") ~ "Kick Sample",
      str_detect(str_to_lower(SITE_NAME), "fyke") ~ "Fyke Net",
      str_detect(str_to_lower(SITE_NAME), "push") ~ "Push Net",
      str_detect(str_to_lower(SITE_NAME), "hand") ~ "Hand Collection",
      str_detect(str_to_lower(SITE_NAME), "gill") ~ "Gill Net",
      TRUE ~ "Unknown"))

##### Map #####
method_types <- unique(sites_sf$METHOD_TYPE)
method_pal <- colorFactor(palette = "Set2", domain = method_types)

leaflet(sites_sf) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    lng = ~lon, lat = ~lat,
    color = ~method_pal(METHOD_TYPE),
    radius = 6,
    stroke = TRUE, weight = 1, fillOpacity = 0.9,
    popup = ~paste0(
      "<b>Site:</b> ", SITE_NAME, "<br>",
      "<b>Zone:</b> ", TOP_TIER_SITE, "<br>",
      "<b>METHOD:</b> ", METHOD_TYPE), group = ~METHOD_TYPE) %>%
  addLegend(
    position = "bottomright",
    pal = method_pal,
    values = ~METHOD_TYPE,
    title = "Method type",
    opacity = 1) %>%
  addLayersControl(
    overlayGroups = method_types,
    options = layersControlOptions(collapsed = FALSE),
    position = "topright") %>%
  addResetMapButton()
# RA
method_totals <- counts_thames %>%
  filter(!is.na(METHOD)) %>%
  group_by(METHOD) %>%
  summarise(
    Total_Individuals = sum(FISH_COUNT, na.rm = TRUE),
    Total_Species = n_distinct(SPECIES_NAME),
    .groups = "drop")

# Get grand total
grand_total <- sum(method_totals$Total_Individuals)

# Add relative abundance column
method_totals <- method_totals %>%
  mutate(
    Relative_Abundance = round(100 * Total_Individuals / grand_total, 1)
  ) %>%
  arrange(desc(Relative_Abundance)) %>%
  mutate(METHOD = factor(METHOD, levels = unique(METHOD)))

##### RA #####
ggplot(method_totals, aes(x = reorder(METHOD, -Relative_Abundance), y = Relative_Abundance)) +
  geom_col(fill = "tomato") +
  labs(
    title = "Relative abundance by sampling method",
    x = "Sampling method",
    y = "Relative abundance (%)") +
  theme_minimal()

# Matrix
method_species_table <- counts_thames %>%
  filter(!is.na(METHOD), !is.na(SPECIES_NAME)) %>%
  count(METHOD, SPECIES_NAME) %>%
  pivot_wider(names_from = METHOD, values_from = n, values_fill = 0)

method_pa <- method_species_table %>%
  mutate(across(-SPECIES_NAME, ~ as.integer(. > 0)))

method_cols <- names(method_pa)[!names(method_pa) %in% "SPECIES_NAME"]

method_unique <- method_pa %>%
  rowwise() %>%
  mutate(unique_to = if (sum(c_across(all_of(method_cols))) == 1) {
    method_cols[which(c_across(all_of(method_cols)) == 1)]
  } else {
    NA_character_
  }) %>%
  ungroup() %>%
  filter(!is.na(unique_to))

unique_counts <- method_unique %>%
  count(unique_to, name = "unique_species_count") %>%
  arrange(desc(unique_species_count))

ggplot(unique_counts, aes(x = reorder(unique_to, unique_species_count), y = unique_species_count, fill = unique_to)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Number of species unique to each method type",
    x = "Sampling method", y = "Unique species count") +
  theme_minimal()

method_pa_long <- method_pa %>%
  pivot_longer(cols = -SPECIES_NAME, names_to = "METHOD", values_to = "present")

ggplot(method_pa_long, aes(x = METHOD, y = SPECIES_NAME, fill = as.factor(present))) +
  geom_tile(color = "white") +
  scale_fill_manual(values = c("0" = "white", "1" = "steelblue")) +
  labs(
    title = "Species presence across sampling method",
    x = "Sampling method", y = "Species",
    fill = "Present") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 6))

# 1) Filter to Seine Net
counts_thames_filtered <- counts_thames %>%
  filter(METHOD == "Seine Net")

# 2) Per-survey CPUE (fish per run)
#    - If NO_OF_RUNS can vary within a survey, switch `first()` to `max()` and sanity-check.
seine_cpue <- counts_thames_filtered %>%
  filter(!is.na(FISH_COUNT), NO_OF_RUNS > 0) %>%
  group_by(SURVEY_ID, EVENT_DATE_YEAR) %>%
  summarise(
    total_fish = sum(FISH_COUNT, na.rm = TRUE),
    runs       = dplyr::first(NO_OF_RUNS),   # or max(NO_OF_RUNS)
    n_checks   = n_distinct(NO_OF_RUNS),     # for sanity checking
    .groups    = "drop"
  ) %>%
  mutate(CPUE = total_fish / runs)

# Optional sanity check:
# stopifnot(all(seine_cpue$n_checks == 1))

# 3) Yearly, survey-balanced summary (each survey = 1 vote)
cpue_year <- seine_cpue %>%
  group_by(EVENT_DATE_YEAR) %>%
  summarise(
    mean_CPUE = mean(CPUE, na.rm = TRUE),
    sd_CPUE   = sd(CPUE, na.rm = TRUE),
    n_surveys = n(),
    se        = sd_CPUE / sqrt(n_surveys),
    .groups   = "drop"
  )

# 4) Quick overview plot (mean ± SE)
p1 <- ggplot(cpue_year, aes(x = EVENT_DATE_YEAR, y = mean_CPUE)) +
  geom_ribbon(aes(ymin = mean_CPUE - se, ymax = mean_CPUE + se),
              alpha = 0.2, fill = "steelblue") +
  geom_line(colour = "steelblue", linewidth = 1) +
  geom_point(colour = "steelblue", size = 2) +
  labs(
    title = "Mean seine net CPUE per year – Thames Estuary (survey-balanced)",
    x = "Year",
    y = "Mean CPUE (fish per run)"
  ) +
  theme_minimal()

# 5) Trend model on yearly means
lm1 <- lm(mean_CPUE ~ EVENT_DATE_YEAR, data = cpue_year)

cpue_year$pred    <- predict(lm1)
cpue_year$pred_se <- predict(lm1, se.fit = TRUE)$se.fit

s      <- summary(lm1)
slope  <- coef(s)["EVENT_DATE_YEAR","Estimate"]
pval   <- coef(s)["EVENT_DATE_YEAR","Pr(>|t|)"]
lab    <- sprintf("LM slope = %.2f fish/run/yr (p = %.3f)", slope, pval)

# 6) Trend plot with fitted mean and 95% CI of the fit
p2 <- ggplot(cpue_year, aes(EVENT_DATE_YEAR, mean_CPUE)) +
  geom_point(size = 2) +
  geom_line(col = "grey40") +
  geom_ribbon(aes(ymin = pred - 1.96 * pred_se, ymax = pred + 1.96 * pred_se),
              fill = "lightblue", alpha = 0.35) +
  geom_line(aes(y = pred), col = "blue", linewidth = 1) +
  labs(
    title = "Trend in mean seine net CPUE – Thames Estuary (survey-balanced)",
    x = "Year", y = "Mean CPUE (fish per run)"
  ) +
  annotate("text",
           x = min(cpue_year$EVENT_DATE_YEAR, na.rm = TRUE) + 1,
           y = max(cpue_year$mean_CPUE, na.rm = TRUE),
           hjust = 0, vjust = 1, size = 3.5, label = lab) +
  theme_minimal()

p1

p2

# --- per-survey CPUE per species ---
species_svy_cpue <- counts_thames_filtered %>%
  filter(NO_OF_RUNS > 0) %>%
  group_by(SURVEY_ID, EVENT_DATE_YEAR, SPECIES_NAME) %>%
  summarise(
    count = sum(FISH_COUNT, na.rm = TRUE),
    runs  = dplyr::first(NO_OF_RUNS),
    .groups = "drop"
  ) %>%
  mutate(CPUE = count / runs)

# --- yearly, survey-balanced CPUE per species ---
species_trends <- species_svy_cpue %>%
  group_by(EVENT_DATE_YEAR, SPECIES_NAME) %>%
  summarise(
    mean_CPUE = mean(CPUE, na.rm = TRUE),
    n_surveys = n(),
    .groups = "drop"
  )

# --- per-species trend (weights = n_surveys) ---
species_model_summary <- function(df) {
  if (dplyr::n_distinct(df$EVENT_DATE_YEAR) < 3 || sd(df$mean_CPUE, na.rm = TRUE) == 0) {
    return(tibble(Slope = NA_real_, CI_low = NA_real_, CI_high = NA_real_,
                  P_value = NA_real_, Significance = NA_character_))
  }
  m <- lm(mean_CPUE ~ EVENT_DATE_YEAR, data = df, weights = n_surveys)
  s  <- summary(m)
  slope <- coef(s)["EVENT_DATE_YEAR","Estimate"]
  se    <- coef(s)["EVENT_DATE_YEAR","Std. Error"]
  p     <- coef(s)["EVENT_DATE_YEAR","Pr(>|t|)"]
  tibble(
    Slope = slope,
    CI_low  = slope - 1.96 * se,
    CI_high = slope + 1.96 * se,
    P_value = p,
    Significance = dplyr::case_when(
      p <= 0.001 ~ "***",
      p <= 0.01  ~ "**",
      p <= 0.05  ~ "*",
      TRUE       ~ ""
    )
  )
}

species_results <- species_trends %>%
  group_by(SPECIES_NAME) %>%
  group_modify(~ species_model_summary(.x)) %>%
  ungroup() %>%
  mutate(
    SigCat = factor(Significance,
                    levels = c("***","**","*",""),
                    labels = c("***","**","*","ns"))
  )

species_results_sig <- species_results %>%
  filter(P_value <= 0.05) %>%
  mutate(
    SigCat = factor(Significance,
                    levels = c("***","**","*"),
                    labels = c("***","**","*"))
  )

# --- Define colour palette for significance ---
sig_cols <- c(
  "***" = "#3b82f6",  # blue
  "**"  = "#22c55e",  # green
  "*"   = "#ef4444"   # red
)

# --- Plot only significant species ---
ggplot(species_results_sig, aes(x = Slope, y = reorder(SPECIES_NAME, Slope))) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
  geom_errorbarh(aes(xmin = CI_low, xmax = CI_high), height = 0.3, color = "gray70") +
  geom_point(aes(color = SigCat), size = 3) +
  scale_color_manual(values = sig_cols, drop = FALSE, name = "Significance",
                     breaks = c("***","**","*")) +
  labs(
    title = "Significant species-level trends in abundance (CPUE) – Thames Estuary",
    subtitle = "Only species with p ≤ 0.05 shown. Slope ± 95% CI (weighted by number of surveys).",
    x = "Slope (Δ CPUE per year)", 
    y = "Species", 
    caption = "Significance: * p ≤ 0.05 | ** p ≤ 0.01 | *** p ≤ 0.001"
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    legend.direction = "horizontal",
    legend.title = element_text(size = 11),
    legend.text = element_text(size = 10)
  )

svy_comp <- counts_thames_filtered %>%
  group_by(EVENT_DATE_YEAR, SURVEY_ID, SPECIES_NAME) %>%
  summarise(n = sum(FISH_COUNT, na.rm = TRUE), .groups = "drop_last")

div_svy <- svy_comp %>%
  summarise(
    Species_Richness  = n_distinct(SPECIES_NAME),
    Total_Abundance   = sum(n),
    Shannon_Diversity = vegan::diversity(n, index = "shannon"),
    Simpson_Diversity = vegan::diversity(n, index = "simpson"),
    .groups = "drop"
  ) %>%
  mutate(
    Margalef_Richness = (Species_Richness - 1) / log(pmax(Total_Abundance, 1)),
    Pielou_Evenness   = Shannon_Diversity / log(pmax(Species_Richness, 2))
  )

diversity_summary <- div_svy %>%
  group_by(EVENT_DATE_YEAR) %>%
  summarise(
    Richness  = mean(Species_Richness),  se_Richness  = sd(Species_Richness)/sqrt(n()),
    Shannon   = mean(Shannon_Diversity), se_Shannon   = sd(Shannon_Diversity)/sqrt(n()),
    Simpson   = mean(Simpson_Diversity), se_Simpson   = sd(Simpson_Diversity)/sqrt(n()),
    Margalef  = mean(Margalef_Richness), se_Margalef  = sd(Margalef_Richness)/sqrt(n()),
    Pielou    = mean(Pielou_Evenness),   se_Pielou    = sd(Pielou_Evenness)/sqrt(n()),
    n_surveys      = n(),
    .groups = "drop"
  )

diversity_long <- diversity_summary %>%
  select(EVENT_DATE_YEAR,
         Richness, Shannon, Simpson, Margalef, Pielou) %>%
  pivot_longer(cols = -EVENT_DATE_YEAR,
               names_to = "Metric", values_to = "Value")

ggplot(diversity_long, aes(x = EVENT_DATE_YEAR, y = Value)) +
  geom_line(color = "red", linewidth = 1) +
  geom_smooth(method = "loess", se = TRUE, color = "black", linewidth = 1) +
  facet_wrap(~ Metric, scales = "free_y") +
  labs(
    title = "Diversity metrics over time – Thames Estuary (survey-balanced)",
    subtitle = "Per-survey diversity averaged per year (LOESS trend)",
    x = "Year", y = "Diversity metric (mean)"
  ) +
  theme_minimal()

lm_richness <- lm(Richness  ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_shannon  <- lm(Shannon   ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_simpson  <- lm(Simpson   ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_margalef <- lm(Margalef  ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_evenness <- lm(Pielou    ~ EVENT_DATE_YEAR, data = diversity_summary)

summary_table <- function(model, metric_name) {
  coef_summary <- summary(model)$coefficients
  slope <- coef_summary["EVENT_DATE_YEAR", "Estimate"]
  se <- coef_summary["EVENT_DATE_YEAR", "Std. Error"]
  p <- coef_summary["EVENT_DATE_YEAR", intersect(c("Pr(>|t|)", "Pr(>|z|)"),
                                                 colnames(coef_summary))[1]]
  ci_low <- slope - 1.96 * se
  ci_high <- slope + 1.96 * se
  sig <- case_when(p <= 0.001 ~ "***", p <= 0.01 ~ "**", p <= 0.05 ~ "*", TRUE ~ "")
  data.frame(Metric = metric_name, Slope = slope, SE = se,
             CI_low = ci_low, CI_high = ci_high,
             P_value = round(p, 4), Significance = sig)
}

results <- bind_rows(
  summary_table(lm_richness,  "Species richness"),
  summary_table(lm_shannon,   "Shannon diversity"),
  summary_table(lm_simpson,   "Simpson diversity"),
  summary_table(lm_margalef,  "Margalef richness"),
  summary_table(lm_evenness,  "Pielou evenness")
)

results_lab <- results %>%
  mutate(
    SigCat = factor(Significance,
                    levels = c("***","**","*",""),
                    labels = c("***","**","*","ns")),
    Metric_lab = dplyr::case_when(
      Metric == "Species richness" ~ Metric,                 # never add stars here
      Significance == ""           ~ Metric,                 # ns → no stars
      TRUE                         ~ paste0(Metric, " ", Significance)
    )
  )

sig_cols <- c(
  "***" = "#3b82f6",  # blue
  "**"  = "#22c55e",  # green
  "*"   = "#ef4444",  # red
  "ns"  = "#9ca3af"   # grey
)

ggplot(results_lab, aes(x = Slope, y = reorder(Metric_lab, Slope))) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
  geom_errorbarh(aes(xmin = CI_low, xmax = CI_high), height = 0.22, color = "gray70") +
  geom_point(aes(color = SigCat), size = 3) +
  scale_color_manual(values = sig_cols, drop = FALSE, name = "Significance",
                     breaks = c("***","**","*","ns")) +
  labs(
    title = "Trend in diversity metrics over time – Thames Estuary",
    subtitle = "Slope ± 95% CI from linear models on yearly survey-balanced means",
    x = "Slope (change per year)", y = "Diversity metric (mean)",
    caption = "Significance: * p ≤ 0.05 | ** p ≤ 0.01 | *** p ≤ 0.001"
  ) +
  theme_minimal() +
  theme(legend.position = "top",
        legend.direction = "horizontal",
        legend.title = element_text(size = 11),
        legend.text  = element_text(size = 10))

pa_svy <- counts_thames_filtered %>%
  filter(!is.na(SPECIES_NAME), !is.na(FISH_COUNT)) %>%
  mutate(PA = as.integer(FISH_COUNT > 0)) %>%
  group_by(EVENT_DATE_YEAR, SURVEY_ID, SPECIES_NAME) %>%
  summarise(present = as.integer(any(PA == 1)), .groups = "drop")

svy_per_year <- pa_svy %>%
  distinct(EVENT_DATE_YEAR, SURVEY_ID) %>%
  count(EVENT_DATE_YEAR, name = "n_surveys")

min_svy <- min(svy_per_year$n_surveys)

set.seed(42)
sampled_surveys <- pa_svy %>%
  distinct(EVENT_DATE_YEAR, SURVEY_ID) %>%
  group_by(EVENT_DATE_YEAR) %>%
  slice_sample(n = min_svy) %>%
  ungroup()

year_species_pa <- pa_svy %>%
  semi_join(sampled_surveys, by = c("EVENT_DATE_YEAR","SURVEY_ID")) %>%
  group_by(EVENT_DATE_YEAR, SPECIES_NAME) %>%
  summarise(present = as.integer(any(present == 1)), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = SPECIES_NAME, values_from = present, values_fill = 0) %>%
  tibble::column_to_rownames("EVENT_DATE_YEAR") %>%
  as.data.frame()

beta_core <- betapart::betapart.core(year_species_pa)
beta_res  <- betapart::beta.pair(beta_core, index.family = "sor")

sor_mat <- as.matrix(beta_res$beta.sor)
sim_mat <- as.matrix(beta_res$beta.sim)
sne_mat <- as.matrix(beta_res$beta.sne)

years <- sort(as.numeric(rownames(sor_mat)))

beta_trends <- tibble::tibble(
  Year1 = years[-length(years)],
  Year2 = years[-1],
  beta_sor = purrr::map2_dbl(Year1, Year2, ~ sor_mat[as.character(.x), as.character(.y)]),
  beta_sim = purrr::map2_dbl(Year1, Year2, ~ sim_mat[as.character(.x), as.character(.y)]),
  beta_sne = purrr::map2_dbl(Year1, Year2, ~ sne_mat[as.character(.x), as.character(.y)])
)

lm_sor <- lm(beta_sor ~ Year2, data = beta_trends)
lm_sim <- lm(beta_sim ~ Year2, data = beta_trends)
lm_sne <- lm(beta_sne ~ Year2, data = beta_trends)

get_lm_stats <- function(model) {
  sm <- summary(model)
  list(r2 = round(sm$r.squared, 3),
       p  = round(sm$coefficients[2, 4], 3))
}

sor_stats <- get_lm_stats(lm_sor)
sim_stats <- get_lm_stats(lm_sim)
sne_stats <- get_lm_stats(lm_sne)

beta_trends_long <- beta_trends %>%
  tidyr::pivot_longer(c(beta_sor, beta_sim, beta_sne),
                      names_to = "Component", values_to = "Dissimilarity") %>%
  dplyr::mutate(Component = dplyr::recode(Component,
                                          beta_sor = "Sorensen", beta_sim = "Turnover", beta_sne = "Nestedness"))

subtitle_text <- paste0(
  "Sorensen: R² = ", sor_stats$r2, ", p = ", sor_stats$p, "   |   ",
  "Turnover: R² = ", sim_stats$r2, ", p = ", sim_stats$p, "   |   ",
  "Nestedness: R² = ", sne_stats$r2, ", p = ", sne_stats$p)


ggplot(beta_trends_long, aes(x = Year2, y = Dissimilarity, color = Component)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  scale_color_manual(values = c("Sorensen" = "darkorchid",
                                "Turnover" = "steelblue",
                                "Nestedness" = "darkorange")) +
  labs(
    title = "Year-to-year change in beta diversity components – Thames Estuary (survey-balanced)",
    subtitle = subtitle_text,
    x = "Year", y = "Dissimilarity", color = "Component"
  ) +
  theme_minimal()

# --- Setup & load ---
setwd("~/PhD/London NERC DTP documents/EA data/EnvironmentAgency_EcologyFish_DataDownload_2025-04-02")

lengths <- read_csv("TraC_Fish_Individual_Lengths_2025-10-10.csv")
sites   <- read_csv("TraC_Fish_Sites_2025-10-10.csv")

# Join + tidy
lengths_clean <- lengths %>%
  left_join(sites, by = "SITE_ID") %>%
  select(
    SITE_ID,
    SITE_NAME = SITE_NAME.x,
    SITE_PARENT_NAME,
    EVENT_DATE,
    SURVEY_ID,
    SPECIES_NAME,
    LATIN_NAME,
    FISH_LENGTH,
    TOP_TIER_SITE,
    SITE_RANKED_NGR,
    SITE_RANKED_EASTING,
    SITE_RANKED_NORTHING
  ) %>%
  mutate(
    EVENT_DATE = ymd(EVENT_DATE),
    EVENT_YEAR = year(EVENT_DATE),
    EVENT_MONTH = month(EVENT_DATE),
    EVENT_DAY = day(EVENT_DATE),
    METHOD = case_when(
      str_detect(str_to_lower(SITE_NAME), "seine") ~ "Seine Net",
      str_detect(str_to_lower(SITE_NAME), "beam")  ~ "Beam Trawl",
      str_detect(str_to_lower(SITE_NAME), "otter") ~ "Otter Trawl",
      str_detect(str_to_lower(SITE_NAME), "kick")  ~ "Kick Sample",
      str_detect(str_to_lower(SITE_NAME), "fyke")  ~ "Fyke Net",
      str_detect(str_to_lower(SITE_NAME), "push")  ~ "Push Net",
      str_detect(str_to_lower(SITE_NAME), "hand")  ~ "Hand Collection",
      str_detect(str_to_lower(SITE_NAME), "gill")  ~ "Gill Net",
      TRUE ~ "Unknown"
    )
  )

# Thames + Seine only
lengths_thames_filtered <- lengths_clean %>%
  filter(str_detect(TOP_TIER_SITE, regex("thames", ignore_case = TRUE)),
         METHOD == "Seine Net",
         !is.na(FISH_LENGTH), FISH_LENGTH > 0)


lengths_thames_filtered2 <- lengths_thames_filtered %>%
  filter(SITE_PARENT_NAME %in% c(
    "Richmond",
    "Kew",
    "Battersea",
    "Greenwich",
    "West Thurrock",
    "Denton Wharf",
    "Stanford-le-Hope Beach"
  ))

# (Optional) Species-wise IQR outlier removal
lengths_thames_filtered <- lengths_thames_filtered2 %>%
  group_by(LATIN_NAME) %>%
  mutate(
    Q1 = quantile(FISH_LENGTH, 0.25, na.rm = TRUE),
    Q3 = quantile(FISH_LENGTH, 0.75, na.rm = TRUE),
    IQRv = Q3 - Q1,
    lo = Q1 - 1.5 * IQRv,
    hi = Q3 + 1.5 * IQRv
  ) %>%
  ungroup() %>%
  filter(FISH_LENGTH >= lo, FISH_LENGTH <= hi)


# Per-survey summaries (one row per survey)
survey_means <- lengths_thames_filtered %>%
  group_by(EVENT_YEAR, SURVEY_ID) %>%
  summarise(
    mean_len_survey   = mean(FISH_LENGTH),
    median_len_survey = median(FISH_LENGTH),
    n_fish            = n(),
    .groups = "drop"
  )

# Yearly (survey-balanced) summaries
survey_bal <- survey_means %>%
  group_by(EVENT_YEAR) %>%
  summarise(
    mean_length   = mean(mean_len_survey),      # equal weight per survey
    median_length = median(mean_len_survey),
    sd_mean       = sd(mean_len_survey),
    n_surveys     = n(),
    se_mean       = sd_mean / sqrt(n_surveys),
    .groups = "drop"
  ) %>%
  arrange(EVENT_YEAR)

# Linear trends on survey-balanced means
lm_mean   <- lm(mean_length   ~ EVENT_YEAR, data = survey_bal)
lm_median <- lm(median_length ~ EVENT_YEAR, data = survey_bal)

get_coef <- function(mod) {
  s <- summary(mod)$coefficients
  list(slope = unname(s["EVENT_YEAR","Estimate"]),
       p     = unname(s["EVENT_YEAR","Pr(>|t|)"]))
}
lab_mean   <- with(get_coef(lm_mean),   sprintf("Mean: slope = %.2f mm/yr (p = %.3g)",   slope, p))
lab_median <- with(get_coef(lm_median), sprintf("Median: slope = %.2f mm/yr (p = %.3g)", slope, p))

# Plot with top annotations
x_left <- min(survey_bal$EVENT_YEAR, na.rm = TRUE)
ggplot(survey_bal, aes(EVENT_YEAR)) +
  geom_ribbon(aes(ymin = mean_length - se_mean, ymax = mean_length + se_mean),
              alpha = 0.15, fill = "steelblue") +
  geom_line(aes(y = mean_length, color = "Mean"), linewidth = 1.2) +
  geom_point(aes(y = mean_length, color = "Mean"), size = 2) +
  geom_smooth(aes(y = mean_length, color = "Mean"), method = "lm", se = FALSE, linewidth = 0.9) +
  geom_line(aes(y = median_length, color = "Median"), linewidth = 1.2, linetype = "dashed") +
  geom_point(aes(y = median_length, color = "Median"), size = 2, shape = 17) +
  geom_smooth(aes(y = median_length, color = "Median"), method = "lm", se = FALSE, linetype = "dashed", linewidth = 0.9) +
  annotate("text", x = x_left, y = Inf, hjust = 0, vjust = 1.4, label = lab_mean,   color = "steelblue",  size = 4) +
  annotate("text", x = x_left, y = Inf, hjust = 0, vjust = 2.8, label = lab_median, color = "darkorange", size = 4) +
  scale_y_continuous(expand = expansion(mult = c(0.02, 0.18))) +
  scale_color_manual(values = c("Mean" = "steelblue", "Median" = "darkorange")) +
  labs(title = "Survey-balanced fish length trends – Thames Estuary",
       subtitle = "Mean ± SE (solid) and median (dashed) per year with linear trends",
       x = "Year", y = "Fish length (mm)", color = "Statistic") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "top", panel.grid.minor = element_blank())

# Thames + Seine only
lengths_thames_filtered <- lengths_clean %>%
  filter(str_detect(TOP_TIER_SITE, regex("thames", ignore_case = TRUE)),
         METHOD == "Seine Net",
         !is.na(FISH_LENGTH), FISH_LENGTH > 0)


lengths_thames_filtered2 <- lengths_thames_filtered %>%
  filter(SITE_PARENT_NAME %in% c(
    "Richmond",
    "Kew",
    "Battersea",
    "Greenwich",
    "West Thurrock",
    "Denton Wharf",
    "Stanford-le-Hope Beach"
  ))

len_dat <- lengths_thames_filtered2 %>%
  mutate(LATIN_NAME = str_squish(LATIN_NAME)) %>%
  filter(!is.na(FISH_LENGTH), FISH_LENGTH > 0,
         !is.na(EVENT_YEAR), !is.na(SITE_ID)) %>%
  mutate(Yc = EVENT_YEAR - mean(EVENT_YEAR, na.rm = TRUE))

# keep species with enough temporal info overall
keep <- len_dat %>%
  group_by(LATIN_NAME) %>%
  summarise(n_years = n_distinct(EVENT_YEAR), n = n(), .groups = "drop") %>%
  filter(n_years >= 3, n >= 30) %>%
  pull(LATIN_NAME)

dat_keep <- len_dat %>% filter(LATIN_NAME %in% keep)

fit_one <- function(df) {
  n_obs   <- nrow(df)
  n_sites <- dplyr::n_distinct(df$SITE_ID)
  max_rep <- if (n_obs == 0) 0 else df %>% count(SITE_ID, name = "n") %>% pull(n) %>% max(na.rm = TRUE)
  use_lmm <- (max_rep >= 2) && (n_sites < n_obs)
  
  if (use_lmm) {
    fit <- try(suppressWarnings(lmer(FISH_LENGTH ~ Yc + (1|SITE_ID), data = df, REML = TRUE)), silent = TRUE)
    if (inherits(fit, "try-error")) return(NULL)
    
    s  <- summary(fit)
    co <- coef(s)
    
    # extract slope/SE
    beta <- unname(co["Yc","Estimate"])
    se   <- unname(co["Yc","Std. Error"])
    
    # try to get p directly; otherwise compute from t as normal approx
    pcol <- intersect(colnames(co), c("Pr(>|t|)", "Pr(>|z|)"))
    if (length(pcol) == 1) {
      pval <- unname(co["Yc", pcol])
    } else {
      tcol <- intersect(colnames(co), c("t value","z value","t.value","z.value"))
      tval <- unname(co["Yc", tcol])
      # Normal approx fallback (conservative would use Satterthwaite via lmerTest)
      pval <- 2 * pnorm(abs(tval), lower.tail = FALSE)
    }
    
    r2_m <- r2_c <- NA_real_
    r2 <- try(suppressWarnings(suppressMessages(performance::r2_nakagawa(fit))), silent = TRUE)
    if (!inherits(r2, "try-error")) { r2_m <- r2$R2_marginal; r2_c <- r2$R2_conditional }
    
    return(tibble(
      model = "LMM (1|SITE)",
      Slope = beta,
      SE    = se,
      p     = pval,
      r2_m  = r2_m,
      r2_c  = r2_c,
      n_obs = n_obs, n_sites = n_sites, max_rep = max_rep
    ))
  }
  
  # LM fallback
  fit <- try(suppressWarnings(lm(FISH_LENGTH ~ Yc, data = df)), silent = TRUE)
  if (inherits(fit, "try-error")) return(NULL)
  s  <- summary(fit); co <- coef(s)
  
  tibble(
    model = "LM",
    Slope = unname(co["Yc","Estimate"]),
    SE    = unname(co["Yc","Std. Error"]),
    p     = unname(co["Yc","Pr(>|t|)"]),
    r2_m  = s$r.squared,
    r2_c  = NA_real_,
    n_obs = nrow(df),
    n_sites = dplyr::n_distinct(df$SITE_ID),
    max_rep = if (nrow(df) > 0) df %>% count(SITE_ID, name = "n") %>% pull(n) %>% max(na.rm = TRUE) else NA_real_
  )
}


safe_fit_one <- function(df) {
  out <- try(fit_one(df), silent = TRUE)
  if (inherits(out, "try-error") || is.null(out)) {
    tibble(
      model   = NA_character_,
      Slope   = NA_real_, SE = NA_real_, p = NA_real_,
      r2_m    = NA_real_, r2_c = NA_real_,
      n_obs   = nrow(df),
      n_sites = dplyr::n_distinct(df$SITE_ID),
      max_rep = if (nrow(df) > 0) df %>% count(SITE_ID, name="n") %>% pull(n) %>% max(na.rm=TRUE) else NA_real_
    )
  } else out
}

species_size_trends <- dat_keep %>%
  group_by(LATIN_NAME, SPECIES_NAME) %>%
  group_split(.keep = TRUE) %>%
  purrr::map_dfr(function(df) {
    safe_fit_one(df) %>%
      mutate(
        LATIN_NAME   = df$LATIN_NAME[1],
        SPECIES_NAME = df$SPECIES_NAME[1]
      )
  }) %>%
  relocate(LATIN_NAME, SPECIES_NAME) %>%
  ungroup() %>%
  mutate(
    Sig = dplyr::case_when(
      !is.na(p) & p <= 0.001 ~ "***",
      !is.na(p) & p <= 0.01  ~ "**",
      !is.na(p) & p <= 0.05  ~ "*",
      TRUE ~ ""
    )
  )

# Significant only, by salinity class
sig_trends <- species_size_trends %>% filter(p <= 0.05)


# --- Keep only significant species (p ≤ 0.05)
sig_trends <- species_size_trends %>%
  filter(p <= 0.05) %>%
  mutate(
    SigCat = factor(Sig,
                    levels = c("***", "**", "*"),
                    labels = c("***", "**", "*"))
  )

# --- Define the same significance colour palette ---
sig_cols <- c(
  "***" = "#3b82f6",  # blue
  "**"  = "#22c55e",  # green
  "*"   = "#ef4444"   # red
)

# --- Plot: consistent significance colours ---
ggplot(sig_trends, aes(x = Slope, y = reorder(SPECIES_NAME, Slope))) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
  geom_errorbarh(aes(xmin = Slope - SE, xmax = Slope + SE),
                 height = 0.25, color = "gray60") +
  geom_point(aes(color = SigCat, shape = model), size = 3) +
  scale_color_manual(values = sig_cols, name = "Significance",
                     breaks = c("***", "**", "*")) +
  scale_shape_manual(values = c("LMM (1|SITE)" = 16, "LM" = 1),
                     name = "Model") +
  labs(
    title = "Significant species body-size trends – Thames Estuary",
    subtitle = "Slope ± SE (mm/year). LMM used when sites have replication; else LM fallback.",
    x = "Change in mean length (mm per year)", 
    y = "Species",
    caption = "Significance: * p ≤ 0.05 | ** p ≤ 0.01 | *** p ≤ 0.001"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "top",
    legend.box = "horizontal",
    legend.title = element_text(size = 11),
    legend.text  = element_text(size = 10)
  )

Long-term seine net data from the seven most consistently surveyed Thames Estuary sites (Richmond, Kew, Battersea, Greenwich, West Thurrock, Denton Wharf, and Stanford-le-Hope Beach) reveal a significant long-term decline in overall fish abundance (CPUE) since the late 1980s. This decline is accompanied by a reduction in species richness and a downward trend in diversity indices (Shannon, Simpson, and Richness), although evenness (Pielou) remained relatively stable.

Species-level analyses indicate declining abundances of key estuarine taxa such as sand smelt, flounder, bleak, and common goby, contrasted by increasing abundances of freshwater-tolerant species including minnow.

Despite these compositional and abundance declines, mean body size increased significantly for many species, particularly euryhaline taxa such as grey mullet species, sea bass, sand goby, sand smelt, and smelt, while only a few (notably European eels > elvers and thick-lipped grey mullet) exhibited decreases.

Beta-diversity analyses further indicate increasing year-to-year dissimilarity, driven largely by species turnover, suggesting that the estuarine assemblage has become progressively restructured rather than uniformly diminished.

These results suggest that the Thames Estuary fish community has become less abundant, less diverse, and compositionally altered, yet with a shift toward larger-bodied individuals.


Overall, the Thames Estuary fish community has shifted toward greater dominance by freshwater-tolerant species, while remaining estuarine and euryhaline taxa exhibit increasing body sizes.


Thames Fisheries data

Thames Fisheries Experiment - angling data

Long-term angling records from the Thames Estuary indicate that overall catch-per-unit-effort (CPUE) has remained broadly stable over the past five decades (LM slope = 0.03 fish per competitor per year, p = 0.392). While total catches show strong interannual variability—likely reflecting variation in fishing effort, environmental conditions, and species availability—there is no significant long-term directional change in mean CPUE.

At the species level, trends reveal contrasting trajectories among common angling targets. Sea bass and whiting show significant positive trends, indicating increasing relative abundance or catchability through time, whereas flounder, cod/codling, and European eel exhibit significant declines. These patterns suggest a gradual shift in the composition of the recreational fish assemblage toward more marine species.

Body-size analyses based on recorded angling lengths between 2013 and 2025 show no significant overall trend in mean or median fish size, though several species (e.g., bass, pouting, whiting) display individual directional shifts.

Overall, the angling dataset indicates a stable but compositionally changing fish community, with increasing prominence of sea bass and whiting offset by declines in traditional estuarine species such as flounder and eel.


Together, the datasets suggest a size–salinity gradient in community composition, with smaller and more abundant freshwater-tolerant species contrasting with larger but less numerous marine and euryhaline taxa.